Section 3: PCA plots

This section uses the counts data (all datasets) generated in Section 1 for cluseter analyses. Breifly, the counts data is imported in R, batch corrected using ComBat_seq, vst transformation and clustering is performed using DESeq2, and results visualized as PCA plots.

PCA plot for all libraries

Prerequisites

R packages required for this section are loaded

setwd("~/github/amnion.vs.other_RNASeq")
library(sva)
library(tidyverse)
library(DESeq2)
library(vsn)
library(pheatmap)
library(ggrepel)
library(RColorBrewer)
library(plotly)

Import datasets

The counts data and its associated metadata (coldata) are imported for analyses

counts = 'assets/new-counts.tsv'
groupFile = 'assets/new-batch.v2.tsv'
coldata <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts <- as.matrix(read.csv(counts, sep = "\t", row.names = "gene.ids"))

inspect the coldata

DT::datatable(coldata)

reorder columns of cts according to coldata rows. Check if it both files matches.

colnames(cts)
#>  [1] "BT_EVT_Okae.1"    "BT_SCT_Okae.1"    "BT_TSC_Okae.1"    "BT_EVT_Okae.2"   
#>  [5] "BT_SCT_Okae.2"    "BT_TSC_Okae.2"    "CT_EVT_Okae.1"    "CT_SCT_Okae.1"   
#>  [9] "CT_TSC_Okae.1"    "CT_EVT_Okae.2"    "CT_SCT_Okae.2"    "CT_TSC_Okae.2"   
#> [13] "CT_EVT_Okae.3"    "CT_SCT_Okae.3"    "CT_TSC_Okae.3"    "n_H9.1"          
#> [17] "n_H9.2"           "nTE_D1.1"         "nTE_D1.2"         "nTE_D2.1"        
#> [21] "nTE_D2.2"         "nTE_D3.1"         "nTE_D3.2"         "nCT_P3.1"        
#> [25] "nCT_P3.2"         "nCT_P10.1"        "nCT_P10.2"        "nCT_P15.1"       
#> [29] "nCT_P15.2"        "cR_nCT_P15.1"     "cR_nCT_P15.2"     "nCT_P15_iPSC.1"  
#> [33] "nCT_P15_iPSC.2"   "CT_Okae.1"        "CT_Okae.2"        "nST.1"           
#> [37] "nST.2"            "nEVT.1"           "nEVT.2"           "pH9.1"           
#> [41] "pH9.2"            "pBAP_D1.1"        "pBAP_D1.2"        "pBAP_D2.1"       
#> [45] "pBAP_D2.2"        "pBAP_D3.1"        "pBAP_D3.2"        "CT_7wk.1"        
#> [49] "CT_7wk.2"         "CT_9wk.1"         "CT_11wk.1"        "amnion_18w.1"    
#> [53] "amnion_9w.1"      "amnion_16w.1"     "amnion_16w.2"     "amnion_22w.1"    
#> [57] "amnion_9w.2"      "amnion_22w.2"     "H1_gt70_D8_BAP.1" "H1_gt70_D8_BAP.2"
#> [61] "H1_gt70_D8_BAP.3" "H1_lt40_D8_BAP.1" "H1_lt40_D8_BAP.2" "H1_lt40_D8_BAP.3"
#> [65] "H1_Yabe.1"        "H1_Yabe.2"        "H1_Yabe.3"        "amnion_Term.1"   
#> [69] "amnion_Term.2"    "amnion_Term.3"    "amnion_Term.4"    "amnion_Term.5"   
#> [73] "amnion_Term.6"    "amnion_Term.7"    "amnion_Term.8"   
#>  [ reached getOption("max.print") -- omitted 10 entries ]
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]

Batch correction

Using combat seq (SVA package) run batch correction - using bioproject ids as variable (dataset origin).

cov1 <- as.factor(coldata$authors)
adjusted_counts <- ComBat_seq(cts, batch = cov1, group = NULL)
#> Found 6 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]

run DESeq2

The batch corrected read counts are then used for running DESeq2 analyses

dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata,
                              design = ~ group)

transformation

vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA plot (all)

PCA plot for the dataset that includes all libraries.

ggplotly(
  ggplot(pcaData, aes(
    PC1, PC2, color = group.1, shape = authors
  )) +
    scale_shape_manual(values = 1:8) +
    theme_bw() +
    theme(legend.title = element_blank()) +
    geom_point(size = 1, stroke = 1) +
    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
    ylab(paste0("PC2: ", percentVar[2], "% variance"))
)

Fig 3.1: PCA plots (all samples)

PCA plot for differenticated cell libraries

Import datasets

The counts data and its associated metadata (coldata) are imported for analyses

setwd("~/github/amnion.vs.other_RNASeq")
counts2 = 'assets/counts-dotted.v3.tsv'
groupFile = 'assets/batch-dotted.v3.tsv'
coldata2 <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts2 <- as.matrix(read.csv(counts2, sep = "\t", row.names = "gene.ids"))

inspect the coldata

DT::datatable(coldata2)

reorder columns of cts according to coldata rows. Check if it both files matches.

all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

Batch correction

Using combat seq (SVA package) run batch correction - using bioproject ids as variable (dataset origin).

cov1 <- as.factor(coldata2$authors)
adjusted_counts <- ComBat_seq(cts2, batch = cov1, group = NULL)
#> Found 3 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

run DESeq2

The batch corrected read counts are then used for running DESeq2 analyses

dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata2,
                              design = ~ group)

transformation

vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA plot (differenticated)

ggplotly(
  ggplot(pcaData, aes(
    PC1, PC2, color = group.1, shape = authors
  )) +
    scale_shape_manual(values = 1:8) +
    theme_bw() +
    theme(legend.title = element_blank()) +
    geom_point(size = 1, stroke = 1) +
    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
    ylab(paste0("PC2: ", percentVar[2], "% variance"))
)

Fig 3.2: PCA plots (differenticated)

Session Information

sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19042)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] plotly_4.9.3                RColorBrewer_1.1-2         
#>  [3] ggrepel_0.9.1               pheatmap_1.0.12            
#>  [5] vsn_3.58.0                  DESeq2_1.30.1              
#>  [7] SummarizedExperiment_1.20.0 Biobase_2.50.0             
#>  [9] MatrixGenerics_1.2.1        matrixStats_0.58.0         
#> [11] GenomicRanges_1.42.0        GenomeInfoDb_1.26.7        
#> [13] IRanges_2.24.1              S4Vectors_0.28.1           
#> [15] BiocGenerics_0.36.1         forcats_0.5.1              
#> [17] stringr_1.4.0               dplyr_1.0.7                
#> [19] purrr_0.3.4                 readr_1.4.0                
#> [21] tidyr_1.1.3                 tibble_3.1.1               
#> [23] ggplot2_3.3.5               tidyverse_1.3.1            
#> [25] sva_3.38.0                  BiocParallel_1.24.1        
#> [27] genefilter_1.72.1           mgcv_1.8-35                
#> [29] nlme_3.1-152               
#> 
#> loaded via a namespace (and not attached):
#>  [1] colorspace_2.0-1       ellipsis_0.3.2         XVector_0.30.0        
#>  [4] fs_1.5.0               rstudioapi_0.13        farver_2.1.0          
#>  [7] affyio_1.60.0          DT_0.18                bit64_4.0.5           
#> [10] AnnotationDbi_1.52.0   fansi_0.4.2            lubridate_1.7.10      
#> [13] xml2_1.3.2             splines_4.0.5          cachem_1.0.5          
#> [16] geneplotter_1.68.0     knitr_1.33             jsonlite_1.7.2        
#> [19] broom_0.7.6            annotate_1.68.0        dbplyr_2.1.1          
#> [22] BiocManager_1.30.15    compiler_4.0.5         httr_1.4.2            
#> [25] backports_1.2.1        assertthat_0.2.1       Matrix_1.3-3          
#> [28] fastmap_1.1.0          lazyeval_0.2.2         limma_3.46.0          
#> [31] cli_2.5.0              htmltools_0.5.1.1      tools_4.0.5           
#> [34] gtable_0.3.0           glue_1.4.2             GenomeInfoDbData_1.2.4
#> [37] affy_1.68.0            Rcpp_1.0.6             cellranger_1.1.0      
#> [40] jquerylib_0.1.4        vctrs_0.3.8            preprocessCore_1.52.1 
#> [43] crosstalk_1.1.1        xfun_0.22              rvest_1.0.0           
#> [46] lifecycle_1.0.0        XML_3.99-0.6           edgeR_3.32.1          
#> [49] zlibbioc_1.36.0        scales_1.1.1           hms_1.1.0             
#> [52] yaml_2.2.1             memoise_2.0.0          sass_0.4.0            
#> [55] stringi_1.6.2          RSQLite_2.2.7          rlang_0.4.11          
#> [58] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.14         
#> [61] lattice_0.20-44        labeling_0.4.2         htmlwidgets_1.5.3     
#> [64] bit_4.0.4              tidyselect_1.1.1       magrittr_2.0.1        
#> [67] bookdown_0.22          R6_2.5.0               generics_0.1.0        
#> [70] DelayedArray_0.16.3    DBI_1.1.1              pillar_1.6.1          
#> [73] haven_2.4.1            withr_2.4.2            survival_3.2-11       
#>  [ reached getOption("max.print") -- omitted 17 entries ]